In [1]:
import altair as alt
import pandas as pd
import os
from toolz.curried import pipe
from vega_datasets import data
from altair import datum

# # Create a new data transformer that stores the files in a directory
# def json_dir(data, data_dir='altairdata'):
#     os.makedirs(data_dir, exist_ok=True)
#     return pipe(data, alt.to_json(filename=data_dir + '/{prefix}-{hash}.{extension}') )

# # Register and enable the new transformer
# alt.data_transformers.register('json_dir', json_dir)
# alt.data_transformers.enable('json_dir')

# Lift Altair's row cap so the full survey table can be handed to the charts
# (the default limit is 5000 rows).
# See here: https://altair-viz.github.io/user_guide/data_transformers.html
# NOTE(review): the commented-out `json_dir` transformer above looks like the
# intended alternative for keeping large data out of the notebook — confirm
# which approach is wanted before sharing.
alt.data_transformers.disable_max_rows()

# alt.renderers.enable('jupyterlab')
# alt.renderers.enable("jupyter", offline=True) # to enable these to work in the cbtf
# alt.renderers.enable("mimetype")
Out[1]:
DataTransformerRegistry.enable('default')
In [2]:
# Load the cleaned thermometer ratings and preview the first five rows.
df = pd.read_csv(
    "../../data/processed/cleaned_thermometer_data.csv",
    low_memory=True,
)
df.head(5)
Out[2]:
Voting_Preference State_Code_FIPS Year_of_Study Thermometer_Liberals Thermometer_Conservatives Thermometer_Gays_and_Lesbians Thermometer_Feminists
0 Democrat 51.0 2020 85.0 30.0 85.0 85.0
1 Democrat 36.0 2020 80.0 50.0 97.0 65.0
2 Major Third Party 32.0 2020 70.0 50.0 70.0 70.0
3 Republican 18.0 2020 30.0 70.0 85.0 60.0
4 Republican 41.0 2020 50.0 50.0 45.0 45.0
In [3]:
# Voting_Preference holds text labels; remove it so the remaining columns can be averaged.
df2 = df.drop('Voting_Preference', axis=1)
In [4]:
# Mean of every remaining column for each (state, survey year) pair,
# with the grouping keys kept as ordinary columns.
df_avg = (
    df2
    .groupby(['State_Code_FIPS', 'Year_of_Study'], as_index=False)
    .mean()
)

Final plot

In [5]:
import altair as alt
import pandas as pd
import numpy as np
from vega_datasets import data


# The four thermometer measures used by every chart in this cell.
thermometer_columns = [
    "Thermometer_Liberals",
    "Thermometer_Conservatives",
    "Thermometer_Gays_and_Lesbians",
    "Thermometer_Feminists",
]

# Integer years make the flattened column names below end in e.g. "_2020";
# the map's Vega expression later slices the last four characters as the year.
df_avg['Year_of_Study'] = df_avg['Year_of_Study'].astype(int)

# Long form: one row per (state, year, attribute).
df_melt = df_avg.melt(
    value_vars=thermometer_columns,
    id_vars=['State_Code_FIPS', 'Year_of_Study'],
    value_name='Value',
    var_name='Attribute',
)

# Wide form: one row per state, one column per (attribute, year) pair.
df_wide = df_melt.pivot(
    values='Value',
    columns=['Attribute', 'Year_of_Study'],
    index='State_Code_FIPS',
).reset_index()


def _flatten_label(pair):
    """Collapse a two-level column label into "<level0>_<level1>".

    The index column carries an empty second level after reset_index();
    for it only the first level is returned.
    """
    first, second = pair
    if second == '':
        return first
    return f"{first}_{second}"


df_wide.columns = [_flatten_label(pair) for pair in df_wide.columns.to_flat_index()]

# Every column of the wide table, including the State_Code_FIPS key.
all_wide_cols = list(df_wide.columns)


# Survey years present in the averaged data, in ascending order.
year_values = sorted(df_avg['Year_of_Study'].unique(), key=int)

# Slider-bound selection on Year_of_Study; the slider runs 2000-2020 in
# steps of 4 and starts on 2020.
slider = alt.binding_range(name='Year: ', min=2000, max=2020, step=4)
select_year = alt.selection_point(
    value=2020,
    bind=slider,
    fields=['Year_of_Study'],
)


# Rectangular brush over both the x and y encodings.
brush = alt.selection_interval(encodings=['x', 'y'])

# Voting-preference selection driven by clicking legend entries.
select_point = alt.selection_point(fields=['Voting_Preference'], bind='legend', toggle=False)

# Currently highlighted state, keyed by FIPS code and initialised to 1.
# Altair 5 expects a boolean for `empty`; the string 'none' is the legacy
# Altair 4 spelling. `empty=False` states the same intent — an empty
# selection matches nothing — in the supported form.
highlight_state = alt.selection_point(
    fields=['State_Code_FIPS'],
    value=1,
    empty=False
)


# Thermometer attribute picked by clicking a legend entry.
select_attribute = alt.selection_point(
    bind='legend',
    fields=['Attribute'],
    toggle=False,
)


# Dropdown + selection choosing the thermometer shown on the scatter x-axis.
x_attribute_dropdown = alt.binding_select(
    name="Select X-Axis for scatter plot: ",
    options=thermometer_columns,
)
select_x_attribute = alt.selection_point(
    value="Thermometer_Liberals",
    bind=x_attribute_dropdown,
    fields=["X_Attribute"],
)

# Dropdown + selection choosing the thermometer shown on the scatter y-axis.
y_attribute_dropdown = alt.binding_select(
    name="Select Y-Axis for scatter plot: ",
    options=thermometer_columns,
)
select_y_attribute = alt.selection_point(
    value="Thermometer_Gays_and_Lesbians",
    bind=y_attribute_dropdown,
    fields=["Y_Attribute"],
)

# US state outlines from the vega_datasets TopoJSON file.
states = alt.topo_feature(data.us_10m.url, 'states')

# `all_wide_cols` also contains the join key State_Code_FIPS. It has to be
# looked up (highlight_state selects on that field) but it is not a rating,
# so it is excluded from the fold; otherwise every state gains a bogus row
# whose "Value" is its FIPS code and which only the year filter removes.
rating_wide_cols = [c for c in all_wide_cols if c != 'State_Code_FIPS']

chart = (
    alt.Chart(states)
    .mark_geoshape()
    # Attach the wide per-state averages to each state shape via its id.
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(df_wide, 'State_Code_FIPS', all_wide_cols)
    )
    # Back to long form: one row per state and "<Attribute>_<Year>" column.
    .transform_fold(
        fold=rating_wide_cols,
        as_=['AttributeYear', 'Value']
    )
    # Split "<Attribute>_<Year>": the last four characters are the year,
    # everything before the separating underscore is the attribute name.
    .transform_calculate(
        Year_of_Study="parseInt(substring(datum.AttributeYear, length(datum.AttributeYear) - 4, length(datum.AttributeYear)))",
        Attribute="substring(datum.AttributeYear, 0, length(datum.AttributeYear) - 5)"
    )
    .transform_filter(select_year)
    .transform_filter(select_attribute)
    .encode(
        # The highlighted state gets a thick, opaque gray outline.
        stroke=alt.condition(
            highlight_state,
            alt.value('gray'), 
            alt.value('white')
        ),
        strokeWidth=alt.condition(
            highlight_state,
            alt.value(5), 
            alt.value(2) 
        ),
        strokeOpacity=alt.condition(
            highlight_state,
            alt.value(1),
            alt.value(0.1)
        ),
        color=alt.Color(
            'Value:Q',
            title='Avg Score',
            scale=alt.Scale(scheme='blueorange', domain=[0, 100])
        ),
        tooltip=[
            alt.Tooltip('id:O', title='State FIPS'),
            alt.Tooltip('Value:Q', title='Avg Rating', format='.2f'),
            alt.Tooltip('Year_of_Study:N', title='Year'),
            alt.Tooltip('Attribute:N', title='Selected Attribute')
        ]
    )
    .project(type='albersUsa')
    .properties(
        width=800,
        height=800,
        title='Average Thermometer Ratings per US State'
    )
    .add_params(select_year, highlight_state, select_attribute)
)

# Scatter of one thermometer against another for the selected year and state.
# Folding the same four columns twice yields every (X, Y) attribute pairing
# per row of df; the two dropdown selections then keep a single pairing.
regressio_plot = (
    alt.Chart(df)
    .transform_fold(thermometer_columns, as_=['Y_Attribute', 'Y_Value'])
    .transform_fold(thermometer_columns, as_=['X_Attribute', 'X_Value'])
    .transform_filter(select_x_attribute)
    .transform_filter(select_y_attribute)
    .mark_point()
    .encode(
        x=alt.X('X_Value:Q', title='Selected X-Axis Thermometer Rating'),
        y=alt.Y('Y_Value:Q', title='Selected Y-Axis Thermometer Rating'),
        color=alt.Color('Voting_Preference:N'),
        # Points outside the legend selection are made fully transparent.
        opacity=alt.condition(select_point, alt.value(1), alt.value(0)),
        tooltip=[
            alt.Tooltip("State_Code_FIPS:N", title="State FIPS"),
            alt.Tooltip("X_Attribute:N", title="Selected X Attribute"),
            alt.Tooltip("Y_Attribute:N", title="Selected Y Attribute"),
            alt.Tooltip("X_Value:Q", title="X-Axis Rating", format=".2f"),
            alt.Tooltip("Y_Value:Q", title="Y-Axis Rating", format=".2f"),
        ],
    )
    .transform_filter(select_year)
    .add_params(select_point, select_x_attribute, select_y_attribute, brush)
    .transform_filter(highlight_state)
    .properties(
        width=250,
        height=250,
        title="Regression Plot of Thermometer Ratings (Dynamic Axes)",
    )
)



# Row counts per voting preference for the points inside the scatter brush,
# restricted to the selected axis pairing, year and state.
# Each predicate gets its own transform_filter call: chained filters keep the
# same rows as the multi-argument form, match how the other charts in this
# notebook filter, and do not depend on transform_filter accepting several
# positional predicates (believed to need Altair 5.5+ — confirm).
# NOTE(review): `brush` is also registered on this chart via add_params, so
# the bars themselves become brushable — confirm that is intended.
bar_plot = (
    alt.Chart(df)
    .transform_fold(thermometer_columns, as_=['Y_Attribute', 'Y_Value'])
    .transform_fold(thermometer_columns, as_=['X_Attribute', 'X_Value'])
    .transform_filter(select_x_attribute)
    .transform_filter(select_y_attribute)
    .transform_filter(brush)
    .mark_bar()
    .encode(
        x=alt.X('count()', title='Count'),
        y=alt.Y("Voting_Preference"),
        color=alt.Color('Voting_Preference:N'),
    )
    .transform_filter(select_year)
    .add_params(select_point, select_x_attribute, select_y_attribute, brush)
    .transform_filter(highlight_state)
    .properties(
        width=250,
        height=50
    )
)


def compute_correlation(df, columns=None):
    """Return the pairwise correlations of the thermometer columns in long form.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to correlate.
    columns : list of str, optional
        Columns to correlate. Defaults to the four thermometer columns.

    Returns
    -------
    pandas.DataFrame
        One row per ordered column pair, with columns "Attribute_X",
        "Attribute_Y" and "Correlation".
    """
    if columns is None:
        columns = ["Thermometer_Gays_and_Lesbians", "Thermometer_Feminists", "Thermometer_Liberals", "Thermometer_Conservatives"]

    corr_matrix = df[list(columns)].corr()

    # Name both axes explicitly instead of relying on reset_index() producing
    # a column called "index": that only happens when the axis is unnamed, so
    # an input whose columns axis carries a name used to raise KeyError.
    corr_df = (
        corr_matrix
        .rename_axis(index="Attribute_X", columns="Attribute_Y")
        .reset_index()
        .melt(id_vars="Attribute_X", var_name="Attribute_Y", value_name="Correlation")
    )

    return corr_df

# Correlations computed over the whole table.
# NOTE(review): overall_corr is not referenced again in the visible cells.
overall_corr = compute_correlation(df)

# One long-form correlation table per (state, year, voting preference) group.
# reset_index() turns the group keys into columns and also exposes the inner
# row index of each per-group table as "level_3", which is dropped.
df_state_corr = (
    df.groupby(["State_Code_FIPS", "Year_of_Study", "Voting_Preference"])[
        ["Thermometer_Gays_and_Lesbians", "Thermometer_Feminists", "Thermometer_Liberals", "Thermometer_Conservatives"]
    ]
    .apply(compute_correlation)
    .reset_index()
    .drop(columns=["level_3"])
)


# Correlation heatmap for the selected year, voting preference and state.
# The three predicates are applied as separate chained filters: this keeps
# the same rows as passing them together, matches the filtering style of the
# other charts in this notebook, and does not depend on transform_filter
# accepting several positional predicates (believed to need Altair 5.5+ —
# confirm).
heatmap = (
    alt.Chart(df_state_corr)
    .mark_rect()
    .encode(
        alt.X("Attribute_X:N", title="Attribute"),
        alt.Y("Attribute_Y:N", title="Attribute"),
        alt.Color("Correlation:Q", title="Correlation", scale=alt.Scale(scheme="blueorange", domain=[-1, 1], reverse=False) ),
        tooltip=[
            alt.Tooltip("Attribute_X:N", title="Attribute 1"),
            alt.Tooltip("Attribute_Y:N", title="Attribute 2"),
            alt.Tooltip("Correlation:Q", title="Correlation", format=".2f"),
            alt.Tooltip("State_Code_FIPS:N", title="State FIPS"),
            alt.Tooltip("Year_of_Study:N", title="Year of Study"),
            alt.Tooltip("Voting_Preference:N", title="Voting Preference")
        ]
    )
    .transform_filter(select_year)
    .transform_filter(select_point)
    .transform_filter(highlight_state)
    .add_params(select_year, select_point)
    .properties(
        width=250,
        height=250,
        title="Correlation Matrix of Thermometer Ratings"
    )
)


# Violin-style density of each thermometer for the selected year and state,
# one facet column per attribute.
# NOTE(review): the year/state filters run after the density transform, so a
# density is estimated for every (attribute, year, state) group before most
# are discarded; filtering first may be cheaper — verify the output is
# unchanged before reordering.
violin_plot = (
    alt.Chart(df)
    .transform_fold(thermometer_columns, as_=['Attribute', 'Value'])
    # The groupby fields are carried through the density transform, which is
    # what lets the later filters still see Year_of_Study and State_Code_FIPS.
    .transform_density(
        density='Value',
        as_=['Value', 'Density'],
        extent=[0, 100],
        groupby=["Attribute", "Year_of_Study", "State_Code_FIPS"],
    )
    .mark_area(orient='horizontal')
    .encode(
        y=alt.Y('Value:Q', title='Thermometer Ratings'),
        # Centre-stacked density gives the mirrored violin outline.
        x=alt.X(
            'Density:Q',
            stack='center',
            impute=None,
            title=None,
            axis=alt.Axis(labels=False, values=[0], grid=False, ticks=True),
        ),
        color=alt.Color('Attribute:N', title='Attribute'),
        column=alt.Column(
            'Attribute:N',
            header=alt.Header(
                titleOrient='bottom',
                labelOrient='bottom',
                labelPadding=0,
            ),
            title='Thermometer Attribute',
        ),
        # Attributes outside the legend selection are dimmed, not removed.
        opacity=alt.condition(select_attribute, alt.value(1), alt.value(0.2)),
    )
    .transform_filter(select_year)
    .transform_filter(highlight_state)
    .add_params(select_attribute)
    .properties(
        width=200,
        title="Distribution of Thermometer Ratings by Attribute",
    )
)

# Empty text chart, 70px high.
# NOTE(review): spacer is not used in the layout below.
spacer = alt.Chart(pd.DataFrame({'text': ['']})).mark_text().properties(height=70)

# Layout: map on the left; on the right the violins sit above a row holding
# the scatter-over-bars stack next to the heatmap.
detail_row = (regressio_plot & bar_plot) | heatmap
side_panel = violin_plot & detail_row

# Colour and opacity scales/legends are kept independent per chart.
final_chart = (
    (chart | side_panel)
    .resolve_scale(color='independent', opacity='independent')
    .resolve_legend(color='independent', opacity='independent')
)

final_chart
Out[5]:
In [ ]:
 
In [ ]:
 
In [ ]: